Loading Packages

library(data.table) # Efficient Dataframe 
library(lubridate) # For Dates 
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree

Dataset Preparation

Loading all datasets: Cereals.csv, eBayAuctions.csv, and UniversalBank.csv

# Read the three CSV files into data.tables
Cereals <- fread(file.path("data", "Cereals.csv"))
eBay <- fread(file.path("data", "eBayAuctions.csv"))
UniversalBank <- fread(file.path("data", "UniversalBank.csv"))

Cereals.csv

Quick Preview

# Show the first two rows of Cereals as an interactive table
DT::datatable(head(Cereals, n = 2))

# Compact per-variable summary: types, stats, frequencies, missingness
dfSummary(Cereals,
          style        = "grid",
          plain.ascii  = FALSE,
          valid.col    = FALSE,
          graph.magnif = 0.75,
          tmp.img.dir  = "/tmp")

Data Frame Summary

Cereals

Dimensions: 77 x 16
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 name
[character]
1. 100%_Bran
2. 100%_Natural_Bran
3. All-Bran
4. All-Bran_with_Extra_Fiber
5. Almond_Delight
6. Apple_Cinnamon_Cheerios
7. Apple_Jacks
8. Basic_4
9. Bran_Chex
10. Bran_Flakes
[ 67 others ]
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
67 (87.0%)
0
(0.0%)
2 mfr
[character]
1. A
2. G
3. K
4. N
5. P
6. Q
7. R
1 ( 1.3%)
22 (28.6%)
23 (29.9%)
6 ( 7.8%)
9 (11.7%)
8 (10.4%)
8 (10.4%)
0
(0.0%)
3 type
[character]
1. C
2. H
74 (96.1%)
3 ( 3.9%)
0
(0.0%)
4 calories
[integer]
Mean (sd) : 106.9 (19.5)
min < med < max:
50 < 110 < 160
IQR (CV) : 10 (0.2)
11 distinct values 0
(0.0%)
5 protein
[integer]
Mean (sd) : 2.5 (1.1)
min < med < max:
1 < 3 < 6
IQR (CV) : 1 (0.4)
1 : 13 (16.9%)
2 : 25 (32.5%)
3 : 28 (36.4%)
4 : 8 (10.4%)
5 : 1 ( 1.3%)
6 : 2 ( 2.6%)
0
(0.0%)
6 fat
[integer]
Mean (sd) : 1 (1)
min < med < max:
0 < 1 < 5
IQR (CV) : 2 (1)
0 : 27 (35.1%)
1 : 30 (39.0%)
2 : 14 (18.2%)
3 : 5 ( 6.5%)
5 : 1 ( 1.3%)
0
(0.0%)
7 sodium
[integer]
Mean (sd) : 159.7 (83.8)
min < med < max:
0 < 180 < 320
IQR (CV) : 80 (0.5)
27 distinct values 0
(0.0%)
8 fiber
[numeric]
Mean (sd) : 2.2 (2.4)
min < med < max:
0 < 2 < 14
IQR (CV) : 2 (1.1)
13 distinct values 0
(0.0%)
9 carbo
[numeric]
Mean (sd) : 14.8 (3.9)
min < med < max:
5 < 14.5 < 23
IQR (CV) : 5 (0.3)
21 distinct values 1
(1.3%)
10 sugars
[integer]
Mean (sd) : 7 (4.4)
min < med < max:
0 < 7 < 15
IQR (CV) : 8 (0.6)
16 distinct values 1
(1.3%)
11 potass
[integer]
Mean (sd) : 98.7 (70.4)
min < med < max:
15 < 90 < 330
IQR (CV) : 77.5 (0.7)
35 distinct values 2
(2.6%)
12 vitamins
[integer]
Mean (sd) : 28.2 (22.3)
min < med < max:
0 < 25 < 100
IQR (CV) : 0 (0.8)
0 : 8 (10.4%)
25 : 63 (81.8%)
100 : 6 ( 7.8%)
0
(0.0%)
13 shelf
[integer]
Mean (sd) : 2.2 (0.8)
min < med < max:
1 < 2 < 3
IQR (CV) : 2 (0.4)
1 : 20 (26.0%)
2 : 21 (27.3%)
3 : 36 (46.8%)
0
(0.0%)
14 weight
[numeric]
Mean (sd) : 1 (0.2)
min < med < max:
0.5 < 1 < 1.5
IQR (CV) : 0 (0.1)
0.50 : 2 ( 2.6%)
0.83 : 1 ( 1.3%)
1.00 : 64 (83.1%)
1.25 : 2 ( 2.6%)
1.30 : 1 ( 1.3%)
1.33 : 5 ( 6.5%)
1.50 : 2 ( 2.6%)
0
(0.0%)
15 cups
[numeric]
Mean (sd) : 0.8 (0.2)
min < med < max:
0.2 < 0.8 < 1.5
IQR (CV) : 0.3 (0.3)
12 distinct values 0
(0.0%)
16 rating
[numeric]
Mean (sd) : 42.7 (14)
min < med < max:
18 < 40.4 < 93.7
IQR (CV) : 17.7 (0.3)
77 distinct values 0
(0.0%)

Missing Variables Plot

# Missing-values plot: percent of NAs per variable in Cereals
gg_miss_var(Cereals, show_pct = TRUE)

# Proportion of NAs per variable (printed as a fraction, e.g. 0.026 = 2.6%)
mean(is.na(Cereals$potass))
mean(is.na(Cereals$sugars))
mean(is.na(Cereals$carbo))

# Proportion of NA cells across the whole dataset
mean(is.na(Cereals))

Cereals.csv has some missing values, such as Potass (2.6%), Sugars (1.3%) and Carbo (1.3%). The dataset has a proportion of 0.32% missing values.

eBayAuctions.csv

Quick Preview

# Show the first two rows of eBay as an interactive table
DT::datatable(head(eBay, n = 2))

# Compact per-variable summary: types, stats, frequencies, missingness
dfSummary(eBay,
          style        = "grid",
          plain.ascii  = FALSE,
          valid.col    = FALSE,
          graph.magnif = 0.75,
          tmp.img.dir  = "/tmp")

Data Frame Summary

eBay

Dimensions: 1972 x 8
Duplicates: 346

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 Category
[character]
1. Music/Movie/Game
2. Collectibles
3. Toys/Hobbies
4. Automotive
5. Antique/Art/Craft
6. SportingGoods
7. Clothing/Accessories
8. Home/Garden
9. Jewelry
10. Health/Beauty
[ 8 others ]
403 (20.4%)
239 (12.1%)
234 (11.9%)
178 ( 9.0%)
177 ( 9.0%)
124 ( 6.3%)
119 ( 6.0%)
102 ( 5.2%)
82 ( 4.2%)
64 ( 3.2%)
250 (12.7%)
0
(0.0%)
2 currency
[character]
1. EUR
2. GBP
3. US
533 (27.0%)
147 ( 7.5%)
1292 (65.5%)
0
(0.0%)
3 sellerRating
[integer]
Mean (sd) : 3560.2 (5973)
min < med < max:
0 < 1853 < 37727
IQR (CV) : 2785 (1.7)
461 distinct values 0
(0.0%)
4 Duration
[integer]
Mean (sd) : 6.5 (2.1)
min < med < max:
1 < 7 < 10
IQR (CV) : 2 (0.3)
1 : 23 ( 1.2%)
3 : 213 (10.8%)
5 : 466 (23.6%)
7 : 967 (49.0%)
10 : 303 (15.4%)
0
(0.0%)
5 endDay
[character]
1. Fri
2. Mon
3. Sat
4. Sun
5. Thu
6. Tue
7. Wed
287 (14.6%)
548 (27.8%)
351 (17.8%)
338 (17.1%)
202 (10.2%)
171 ( 8.7%)
75 ( 3.8%)
0
(0.0%)
6 ClosePrice
[numeric]
Mean (sd) : 36.4 (89.5)
min < med < max:
0 < 10 < 999
IQR (CV) : 23.1 (2.5)
827 distinct values 0
(0.0%)
7 OpenPrice
[numeric]
Mean (sd) : 12.9 (38.9)
min < med < max:
0 < 4.5 < 999
IQR (CV) : 8.8 (3)
291 distinct values 0
(0.0%)
8 Competitive?
[integer]
Min : 0
Mean : 0.5
Max : 1
0 : 906 (45.9%)
1 : 1066 (54.1%)
0
(0.0%)

Missing Variables Plot

# Missing-values plot: percent of NAs per variable in the eBay dataset
gg_miss_var(eBay, show_pct = TRUE)

eBayAuctions.csv has no missing values.

UniversalBank.csv

Quick Preview

# Preview of the Dataset.
# FIX: this section is about UniversalBank, but the original previewed and
# summarized the eBay dataset (copy-paste slip from the previous section).
DT::datatable(head(UniversalBank, 2))
dfSummary(UniversalBank, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

eBay (NOTE(review): the summary below was generated from the eBay dataset, not UniversalBank — this section's output should be regenerated from the UniversalBank data)

Dimensions: 1972 x 8
Duplicates: 346

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 Category
[character]
1. Music/Movie/Game
2. Collectibles
3. Toys/Hobbies
4. Automotive
5. Antique/Art/Craft
6. SportingGoods
7. Clothing/Accessories
8. Home/Garden
9. Jewelry
10. Health/Beauty
[ 8 others ]
403 (20.4%)
239 (12.1%)
234 (11.9%)
178 ( 9.0%)
177 ( 9.0%)
124 ( 6.3%)
119 ( 6.0%)
102 ( 5.2%)
82 ( 4.2%)
64 ( 3.2%)
250 (12.7%)
0
(0.0%)
2 currency
[character]
1. EUR
2. GBP
3. US
533 (27.0%)
147 ( 7.5%)
1292 (65.5%)
0
(0.0%)
3 sellerRating
[integer]
Mean (sd) : 3560.2 (5973)
min < med < max:
0 < 1853 < 37727
IQR (CV) : 2785 (1.7)
461 distinct values 0
(0.0%)
4 Duration
[integer]
Mean (sd) : 6.5 (2.1)
min < med < max:
1 < 7 < 10
IQR (CV) : 2 (0.3)
1 : 23 ( 1.2%)
3 : 213 (10.8%)
5 : 466 (23.6%)
7 : 967 (49.0%)
10 : 303 (15.4%)
0
(0.0%)
5 endDay
[character]
1. Fri
2. Mon
3. Sat
4. Sun
5. Thu
6. Tue
7. Wed
287 (14.6%)
548 (27.8%)
351 (17.8%)
338 (17.1%)
202 (10.2%)
171 ( 8.7%)
75 ( 3.8%)
0
(0.0%)
6 ClosePrice
[numeric]
Mean (sd) : 36.4 (89.5)
min < med < max:
0 < 10 < 999
IQR (CV) : 23.1 (2.5)
827 distinct values 0
(0.0%)
7 OpenPrice
[numeric]
Mean (sd) : 12.9 (38.9)
min < med < max:
0 < 4.5 < 999
IQR (CV) : 8.8 (3)
291 distinct values 0
(0.0%)
8 Competitive?
[integer]
Min : 0
Mean : 0.5
Max : 1
0 : 906 (45.9%)
1 : 1066 (54.1%)
0
(0.0%)

Missing Variables Plot

# Missing-values plot for the dataset.
# FIX: this section is about UniversalBank, but the original plotted the
# eBay dataset (copy-paste slip).
gg_miss_var(UniversalBank, show_pct = TRUE)

UniversalBank.csv appears to have no missing values — note, however, that the preview and plot above were generated from the eBay data; rerun them on UniversalBank to confirm.

Ex 13.1

Acceptance of Consumer Loan

# Read UniversalBank.csv into a data.table
UniversalBank <- fread(file.path("data", "UniversalBank.csv"))

# Drop identifier columns (no predictive value), in place
UniversalBank[, c("ZIP Code", "ID") := NULL]

Partition the data: 60% training, 40% validation.

# Setting Seed for a reproducible split
set.seed(1)

# Training and Validation Proportion
Training_Proportion <- 0.6
Validation_Proportion <- 1-Training_Proportion

# Splitting: draw a TRUE/FALSE mask per row (~60% TRUE).
# FIX: the mask was originally stored in a variable called `sample`,
# which shadows base::sample(); renamed to avoid that footgun.
in_training <- sample(c(TRUE, FALSE), nrow(UniversalBank), replace=TRUE, prob=c(Training_Proportion,Validation_Proportion))

UB_Training  <- UniversalBank[in_training, ]

UB_Validation   <- UniversalBank[!in_training, ]

# Checking Proportions (the split is stochastic, so only approximately 60/40)
Training_Proportion_Check <- nrow(UB_Training)/nrow(UniversalBank)
Validation_Proportion_Check <- nrow(UB_Validation)/nrow(UniversalBank)

# Printing Result Check
print(paste("Proportion in Training is", Training_Proportion_Check*100,"%", "and in Validation is",Validation_Proportion_Check*100,"%"))
FALSE [1] "Proportion in Training is 61 % and in Validation is 39 %"

a. Fit Models to the data for (1) logistic regression, (2) k-nearest neighbors with k = 3, and (3) Classification trees.

Use Personal Loan as the outcome variable. Report the validation confusion matrix for each of the three models.

Logistic Regression

# Set Seed
set.seed(1)

# Work on copies so the shared partitions stay untouched for the other models
LR_Training <- UB_Training
LR_Validation <- UB_Validation

# Shared factor codings, defined once
loan_labels <- c("No Loan", "Loan")
edu_labels <- c("Undergrad", "Graduate", "Advanced/Professional")

# Recode the outcome as a labelled factor in both sets
LR_Training$`Personal Loan` <- factor(LR_Training$`Personal Loan`, levels = c(0, 1), labels = loan_labels)
LR_Validation$`Personal Loan` <- factor(LR_Validation$`Personal Loan`, levels = c(0, 1), labels = loan_labels)

# Recode Education as a labelled factor in both sets
LR_Training$Education <- factor(LR_Training$Education, levels = 1:3, labels = edu_labels)
LR_Validation$Education <- factor(LR_Validation$Education, levels = 1:3, labels = edu_labels)

# Fit the logistic regression on the training set
logit.reg <- glm(`Personal Loan` ~ ., data = LR_Training, family = "binomial")

# Disable scientific notation in the printed summary
options(scipen = 999)

summary(logit.reg)
FALSE 
FALSE Call:
FALSE glm(formula = `Personal Loan` ~ ., family = "binomial", data = LR_Training)
FALSE 
FALSE Deviance Residuals: 
FALSE     Min       1Q   Median       3Q      Max  
FALSE -2.1853  -0.1818  -0.0672  -0.0212   4.1974  
FALSE 
FALSE Coefficients:
FALSE                                   Estimate  Std. Error z value
FALSE (Intercept)                    -10.3177782   2.5770778  -4.004
FALSE Age                             -0.1295287   0.0971009  -1.334
FALSE Experience                       0.1462520   0.0961873   1.520
FALSE Income                           0.0589502   0.0038257  15.409
FALSE Family                           0.6028245   0.1013175   5.950
FALSE CCAvg                            0.2018200   0.0586504   3.441
FALSE EducationGraduate                3.9264743   0.3520393  11.154
FALSE EducationAdvanced/Professional   4.0665622   0.3382094  12.024
FALSE Mortgage                         0.0012640   0.0007709   1.640
FALSE `Securities Account`            -0.5440472   0.3724749  -1.461
FALSE `CD Account`                     3.7374014   0.4324751   8.642
FALSE Online                          -0.5841282   0.2188906  -2.669
FALSE CreditCard                      -0.8865331   0.2775008  -3.195
FALSE                                            Pr(>|z|)    
FALSE (Intercept)                           0.00006236641 ***
FALSE Age                                        0.182217    
FALSE Experience                                 0.128387    
FALSE Income                         < 0.0000000000000002 ***
FALSE Family                                0.00000000268 ***
FALSE CCAvg                                      0.000579 ***
FALSE EducationGraduate              < 0.0000000000000002 ***
FALSE EducationAdvanced/Professional < 0.0000000000000002 ***
FALSE Mortgage                                   0.101073    
FALSE `Securities Account`                       0.144118    
FALSE `CD Account`                   < 0.0000000000000002 ***
FALSE Online                                     0.007617 ** 
FALSE CreditCard                                 0.001400 ** 
FALSE ---
FALSE Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE 
FALSE (Dispersion parameter for binomial family taken to be 1)
FALSE 
FALSE     Null deviance: 1947.62  on 3049  degrees of freedom
FALSE Residual deviance:  686.36  on 3037  degrees of freedom
FALSE AIC: 712.36
FALSE 
FALSE Number of Fisher Scoring iterations: 8

Predictions and Confusion Matrix

# Set Seed 
set.seed(1)

# Predicted probabilities on the validation set.
# FIX: the original dropped the column "`Personal Loan`" with literal
# backticks inside the string, which matches no column name; the actual
# column is "Personal Loan". (predict() ignores the outcome column anyway,
# but the subset is now spelled correctly.)
logit.reg.pred <- predict(logit.reg, LR_Validation[, -c("Personal Loan")], type = "response")

# Classify with a 0.5 threshold: probability > 0.5 => 1 (Loan).
# (Replaces round(), which implements banker's rounding, with the explicit
# comparison the comment promised — the results coincide here.)
logit.reg.pred_round <- as.numeric(logit.reg.pred > 0.5)

# Side-by-side check of probabilities vs thresholded classes
dataframe_predictions <- cbind(logit.reg.pred, logit.reg.pred_round)

# Predictions as Factor
logit.reg.pred_round <- factor(logit.reg.pred_round, levels = c(0,1),labels = c("No Loan","Loan"))

# Confusion Matrix
Confusion_Matrix_LR <- confusionMatrix(data = logit.reg.pred_round, reference = LR_Validation$`Personal Loan`)

# Render a graphical 2x2 confusion matrix for the logistic regression model,
# with selected per-class statistics and overall Accuracy/Kappa underneath.
# `cm` is a caret::confusionMatrix object.
draw_confusion_matrix_LR <- function(cm) {

  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Blank canvas for the matrix panel
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Logistic Regression", cex.main = 2)

  # Four quadrant rectangles (diagonal cells in the darker shade)
  rect(c(150, 250, 150, 250), c(430, 430, 305, 305),
       c(240, 340, 240, 340), c(370, 370, 365, 365),
       col = c("#1c6155", "#1c615570", "#1c615570", "#1c6155"))

  # Axis labels
  text(c(195, 295), 435, c("No Loan", "Loan"), cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(140, c(400, 335), c("No Loan", "Loan"), cex = 1.2, srt = 90)

  # Cell counts, in column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # Second panel: sensitivity, specificity, precision, recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # Overall accuracy and kappa
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_LR(Confusion_Matrix_LR)

Comments:

K-nearest neighbors with k = 3

Data Preprocessing

# Set Seed 
set.seed(1)

# Target Variable As Factor
UB_Training$`Personal Loan` <- factor(UB_Training$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan")) 

UB_Validation$`Personal Loan` <- factor(UB_Validation$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan"))

# Education as Factor
UB_Training$Education <- factor(UB_Training$Education)

UB_Validation$Education <- factor(UB_Validation$Education)

# Library Caret
library(caret)

# One-hot encode Education.
# FIX: the encoder must be fitted on the TRAINING data only and then applied
# to both sets. The original also fitted a second encoder (`dummy_2`) on the
# validation set but never used it — that dead object is removed.
dummy <- dummyVars(" ~ Education", data=UB_Training)
Education_Dummy <- data.frame(predict(dummy, newdata = UB_Training)) 
Education_Dummy_2 <- data.frame(predict(dummy, newdata = UB_Validation)) 

# Replace Education with its one-hot dummy columns
UB_Training <- UB_Training[,-c("Education")]
UB_Training <- cbind(UB_Training,Education_Dummy)

UB_Validation <- UB_Validation[,-c("Education")]
UB_Validation <- cbind(UB_Validation,Education_Dummy_2)

# Duplicate Dataset for Preprocess
UB_Training_Preprocess <- UB_Training
UB_Validation_Preprocess <- UB_Validation

# Fit the centering/scaling on the continuous predictors only:
# the outcome and the 0/1 dummy columns are excluded from the fit
UB_Training_Preprocess <- UB_Training_Preprocess[,-c("Personal Loan","Online","CreditCard","Securities Account","CD Account","Education.1","Education.2","Education.3")]

# Learn center/scale values on training, then apply the SAME transform to
# both sets (columns not seen by preProcess pass through unchanged)
norm_values <- preProcess(UB_Training_Preprocess, method = c("center","scale"))

UB_Training_Preprocess <- predict(norm_values,UB_Training)
UB_Validation_Preprocess <- predict(norm_values, UB_Validation)

Predictions and Confusion Matrix

# Set Seed 
set.seed(1)

# KNN Model using class package
library(class)

# Choosing our K value
k <- 3

# KNN: prob = TRUE attaches the winning class's vote share as an attribute
Predictions_KNN_Training <- knn(train=UB_Training_Preprocess[,-c("Personal Loan")], test = UB_Validation_Preprocess[,-c("Personal Loan")], cl = UB_Training_Preprocess$`Personal Loan`, k=k, prob = TRUE)

# Convert the vote share to P(Loan).
# FIX: attr(, "prob") is the proportion of neighbours voting for the
# PREDICTED class, not for "Loan". Keep it when the prediction is "Loan",
# otherwise take its complement. (The original set every value equal to 1
# to 0, which mislabelled unanimous "Loan" predictions as probability 0.)
winning_share <- attributes(Predictions_KNN_Training)$prob
p_loan <- ifelse(Predictions_KNN_Training == "Loan", winning_share, 1 - winning_share)
Predictions_KNN_Training_Proba <- as.data.frame(p_loan)

# Confusion Matrix
Confusion_Matrix_KNN <- confusionMatrix(data = Predictions_KNN_Training, reference = UB_Validation$`Personal Loan`)

# Render a graphical 2x2 confusion matrix for the k-NN (k = 3) model,
# with selected per-class statistics and overall Accuracy/Kappa underneath.
# `cm` is a caret::confusionMatrix object.
draw_confusion_matrix_KNN <- function(cm) {

  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Blank canvas for the matrix panel
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for KNN with k=3", cex.main = 2)

  # Four quadrant rectangles (diagonal cells in the darker shade)
  rect(c(150, 250, 150, 250), c(430, 430, 305, 305),
       c(240, 340, 240, 340), c(370, 370, 365, 365),
       col = c("#1c6155", "#1c615570", "#1c615570", "#1c6155"))

  # Axis labels
  text(c(195, 295), 435, c("No Loan", "Loan"), cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(140, c(400, 335), c("No Loan", "Loan"), cex = 1.2, srt = 90)

  # Cell counts, in column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # Second panel: sensitivity, specificity, precision, recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # Overall accuracy and kappa
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_KNN(Confusion_Matrix_KNN)

Comments:

Classification Trees

# Set Seed 
set.seed(1)

# Tree Packages
library(rpart)
library(rpart.plot)

# Fit the classification tree.
# FIX: the original formula was "UB_Training$'Personal Loan' ~ ." — because
# the response was given as df$column rather than a bare column name, the
# "." still expanded to ALL columns of `data`, including the outcome itself,
# leaking the answer into the predictors. Naming the column directly makes
# "." exclude it.
CT <- rpart(`Personal Loan` ~ . , data=UB_Training)

# Plotting Tree (leaves shaded, internal nodes white)
CT_Plot <- rpart.plot(CT, type=0, varlen = 0, box.col=ifelse(CT$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, main="Classification Trees", cex.main=1.5)

Predictions and Confusion Matrix

# Set Seed
set.seed(1)

# Predicted class labels and class probabilities on the validation set
CT_Predictions <- predict(CT, UB_Validation, type = "class")
CT_Predictions_Proba <- predict(CT, UB_Validation, type = "prob")

# Confusion matrix: predicted vs actual Personal Loan
Confusion_Matrix_CT <- confusionMatrix(data = CT_Predictions, reference = UB_Validation$`Personal Loan`)

# Render a graphical 2x2 confusion matrix for the classification tree,
# with selected per-class statistics and overall Accuracy/Kappa underneath.
# `cm` is a caret::confusionMatrix object.
draw_confusion_matrix_CT <- function(cm) {

  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Blank canvas for the matrix panel
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Classification Trees", cex.main = 2)

  # Four quadrant rectangles (diagonal cells in the darker shade)
  rect(c(150, 250, 150, 250), c(430, 430, 305, 305),
       c(240, 340, 240, 340), c(370, 370, 365, 365),
       col = c("#1c6155", "#1c615570", "#1c615570", "#1c6155"))

  # Axis labels
  text(c(195, 295), 435, c("No Loan", "Loan"), cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(140, c(400, 335), c("No Loan", "Loan"), cex = 1.2, srt = 90)

  # Cell counts, in column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # Second panel: sensitivity, specificity, precision, recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # Overall accuracy and kappa
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_CT(Confusion_Matrix_CT)

Comments:

Most Accurate Model

# Collect overall accuracy (element 1 of $overall) from each model
Accuracy_CT <- Confusion_Matrix_CT$overall[1]

Accuracy_KNN <- Confusion_Matrix_KNN$overall[1]

Accuracy_LR <- Confusion_Matrix_LR$overall[1]

# One-row comparison table. (data.frame() already returns a data frame;
# the original's extra as.data.frame() call was redundant and is removed.)
Accuracy_DF <- data.frame(Accuracy_CT,Accuracy_KNN,Accuracy_LR)

# Display the accuracy comparison
data.table(Accuracy_DF)
FALSE    Accuracy_CT Accuracy_KNN Accuracy_LR
FALSE 1:   0.9815385    0.9692308   0.9487179

Comments: The most accurate model is the Classification Tree, with about 98% accuracy on the validation set; the least accurate is the Logistic Regression, at roughly 95%.

b. Create a dataframe with the actual outcome and the predicted outcome from each of the three models

# Wrap each model's validation-set predictions as a one-column data frame
LR_Dataframe <- data.frame(logit.reg.pred_round)
KNN_Dataframe <- data.frame(Predictions_KNN_Training)
CT_Dataframe <- data.frame(CT_Predictions)

# Combine the three prediction columns with the actual outcome
All_Models_Predictions <- cbind(LR_Dataframe, KNN_Dataframe, CT_Dataframe, UB_Validation$`Personal Loan`)

# Assign all four human-readable column names in one go
colnames(All_Models_Predictions) <- c("Predicted - LR", "Predicted - KNN", "Predicted - CT", "Actual")

# Display the comparison table
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation - 3 Models") 

c. Add two columns: a majority-vote predicted outcome and the average of the predicted probabilities. Derive a confusion matrix for each method and report the overall accuracy.

Majority Vote Column

# Majority vote of the three models, vectorised.
# Each "Predicted - *" column is a factor with levels c("No Loan","Loan"),
# so as.integer(col) - 1 recovers the 0/1 coding. A row is classed "Loan"
# when the mean vote exceeds 0.5, i.e. at least two of the three models
# predict Loan. (The original looped over rows and round-tripped every
# column factor -> numeric -> factor, which was a no-op; both are removed.)
loan_votes <- (as.integer(All_Models_Predictions$`Predicted - LR`) - 1) +
  (as.integer(All_Models_Predictions$`Predicted - KNN`) - 1) +
  (as.integer(All_Models_Predictions$`Predicted - CT`) - 1)

All_Models_Predictions$Majority <- factor(as.integer(loan_votes / 3 > 0.5),
                                          levels = c(0, 1),
                                          labels = c("No Loan", "Loan"))

# Show Majority Column
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Majority Vote Added) - 3 Models") 

Confusion Matrix with the Majority Vote

# Confusion matrix for the majority-vote ensemble
Confusion_Matrix_Majority <- confusionMatrix(data = All_Models_Predictions$Majority, reference = All_Models_Predictions$Actual)

# Render a graphical 2x2 confusion matrix for the majority-vote ensemble,
# with selected per-class statistics and overall Accuracy/Kappa underneath.
# `cm` is a caret::confusionMatrix object.
draw_confusion_matrix_Majority <- function(cm) {

  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Blank canvas for the matrix panel
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Majority of Votes", cex.main = 2)

  # Four quadrant rectangles (diagonal cells in the darker shade)
  rect(c(150, 250, 150, 250), c(430, 430, 305, 305),
       c(240, 340, 240, 340), c(370, 370, 365, 365),
       col = c("#1c6155", "#1c615570", "#1c615570", "#1c6155"))

  # Axis labels
  text(c(195, 295), 435, c("No Loan", "Loan"), cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(140, c(400, 335), c("No Loan", "Loan"), cex = 1.2, srt = 90)

  # Cell counts, in column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # Second panel: sensitivity, specificity, precision, recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # Overall accuracy and kappa
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_Majority(Confusion_Matrix_Majority)

Comments: The Majority Vote has an accuracy of 97.2%, which is slightly better than LR and KNN but less accurate than CT.

Average of Predicted Probabilities

# Ensemble 2: average the three models' predicted probabilities of "Loan".
# (Vectorised; the original initialised the column to NA and looped row by
# row. Column 2 of CT_Predictions_Proba is P(Loan) from the tree.)
All_Models_Predictions$Average <-
  (CT_Predictions_Proba[, 2] + logit.reg.pred + Predictions_KNN_Training_Proba[, 1]) / 3

# Round the averaged probabilities to 4 decimals.
# (Referencing the column by name instead of the fragile positional [,6].)
All_Models_Predictions$Average <- round(All_Models_Predictions$Average, 4)

# Show Average Column with Probabilities
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Probabilities Added) - 3 Models") 

# Classify with cutoff 0.5: probability > 0.5 => 1 (Loan)
All_Models_Predictions$Average <- as.numeric(All_Models_Predictions$Average > 0.5)

# As Factor Average
All_Models_Predictions$Average <- factor(All_Models_Predictions$Average, levels = c(0,1),labels = c("No Loan","Loan"))

# Show Average Column with class labels
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Added) - 3 Models") 

Confusion Matrix with the Average Predicted Probabilities

# Confusion matrix for the averaged-probability ensemble
Confusion_Matrix_Average <- confusionMatrix(data = All_Models_Predictions$Average, reference = All_Models_Predictions$Actual)

# Render a graphical 2x2 confusion matrix for the averaged-probability
# ensemble, with selected per-class statistics and overall Accuracy/Kappa
# underneath. `cm` is a caret::confusionMatrix object.
draw_confusion_matrix_Average <- function(cm) {

  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Blank canvas for the matrix panel
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Average Predicted Probabilities", cex.main = 2)

  # Four quadrant rectangles (diagonal cells in the darker shade)
  rect(c(150, 250, 150, 250), c(430, 430, 305, 305),
       c(240, 340, 240, 340), c(370, 370, 365, 365),
       col = c("#1c6155", "#1c615570", "#1c615570", "#1c6155"))

  # Axis labels
  text(c(195, 295), 435, c("No Loan", "Loan"), cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  text(140, c(400, 335), c("No Loan", "Loan"), cex = 1.2, srt = 90)

  # Cell counts, in column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # Second panel: sensitivity, specificity, precision, recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # Overall accuracy and kappa
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_Average(Confusion_Matrix_Average)

Comments:

d. Compare Error Rates for the 3 individual methods and the 2 ensemble methods.

# Error rate = 1 - accuracy, for every method

# The three individual models
Error_CT <- 1 - Accuracy_CT
Error_KNN <- 1 - Accuracy_KNN
Error_LR <- 1 - Accuracy_LR

# The two ensemble methods
Error_Majority <- 1 - Confusion_Matrix_Majority$overall[1]
Error_Average <- 1 - Confusion_Matrix_Average$overall[1]

# Combine into a single one-row comparison table
Error_Rates_DF <- cbind(Error_LR, Error_KNN, Error_CT, Error_Majority, Error_Average)

# Round to 3 decimals for display
Error_Rates_DF <- round(Error_Rates_DF, 3)

DT::datatable(Error_Rates_DF, caption = "Error Rates from 3 Individual Methods and 2 Ensembles Methods") 

Comments:

Ex 13.2

eBay Auctions - Boosting and Bagging

# Read the eBay auctions data set with data.table's fast reader
eBayAuction_DF <- fread(file.path("data", "eBayAuctions.csv"))

Partition the data: 60% training, 40% validation.

# Setting Seed so the random partition is reproducible
set.seed(1)

# Target training / validation shares
Training_Proportion <- 0.6
Validation_Proportion <- 1 - Training_Proportion

# One Bernoulli draw per row: TRUE -> training, FALSE -> validation.
# Named In_Training rather than `sample` so it does not mask base::sample();
# the sample() call itself is unchanged, so the split is identical.
In_Training <- sample(c(TRUE, FALSE), nrow(eBayAuction_DF), replace = TRUE, prob = c(Training_Proportion, Validation_Proportion))

EB_Training <- eBayAuction_DF[In_Training, ]

EB_Validation <- eBayAuction_DF[!In_Training, ]

# Checking realised Proportions (random draw, so only approximately 60/40)
Training_Proportion_Check <- nrow(EB_Training) / nrow(eBayAuction_DF)
Validation_Proportion_Check <- nrow(EB_Validation) / nrow(eBayAuction_DF)

# Printing Result Check
print(paste("Proportion in Training is", round(Training_Proportion_Check*100), "%", "and in Validation is", round(Validation_Proportion_Check*100), "%"))
## [1] "Proportion in Training is 61 % and in Validation is 39 %"

a. Run a Classification Tree, using default controls of rpart()

Actual VS Predictions

# Set Seed 
set.seed(1)

# Tree Packages
library(rpart)
library(rpart.plot)

# Duplicate before Factor: keep numeric 0/1 outcome copies for the gains charts
EB_Training_Pure <- EB_Training
EB_Validation_Pure <- EB_Validation

# Factor for outcome variable 
EB_Training$`Competitive?` <- factor(EB_Training$`Competitive?`, levels = c(0, 1), labels = c("No", "Yes"))
EB_Validation$`Competitive?` <- factor(EB_Validation$`Competitive?`, levels = c(0, 1), labels = c("No", "Yes"))

# Running Classification Tree with default rpart() controls.
# BUG FIX: the LHS must be the bare column name. With `EB_Training$...` on
# the LHS, the `.` expands to ALL columns of `data` -- including the outcome
# itself -- leaking the label into the predictors.
CT <- rpart(`Competitive?` ~ ., data = EB_Training)

# Predicted classes and class probabilities for the Validation Set
CT_Predictions <- predict(CT, EB_Validation, type = 'class')
CT_Predictions_Prob <- predict(CT, EB_Validation, type = 'prob')

# Check Actual VS Predicted
Predictions_Actual_CT <- data.frame(EB_Validation$`Competitive?`, CT_Predictions)

# Renaming Columns
colnames(Predictions_Actual_CT)[2] <- "Predicted"
colnames(Predictions_Actual_CT)[1] <- "Actual"

# Render Table
DT::datatable(Predictions_Actual_CT, caption = "Actual VS Predicted with Default Classification Trees") 

Confusion Matrix

# Set seed
set.seed(1)

# Confusion Matrix: predicted classes vs actual labels on the validation set
Confusion_Matrix_CT_2 <- confusionMatrix(data = CT_Predictions, reference = EB_Validation$`Competitive?`)

# Helper that renders a caret confusion matrix as a coloured 2x2 grid plus a
# statistics strip (sensitivity, specificity, precision, recall, F1, accuracy, kappa)
draw_confusion_matrix_CT_2 <- function(cm) {

  # Two-panel layout: matrix on top, statistics strip below
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Default Classification Trees', cex.main=2)

  # Draw the 2x2 grid; columns = actual class, rows = predicted class
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(c(195, 295), 435, c('No', 'Yes'), cex=1.2)
  text(140, c(400, 335), c('No', 'Yes'), cex=1.2, srt=90)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)

  # Cell counts, taken in the column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts, cex=1.6, font=2, col='white')

  # Lower panel: selected per-class statistics from caret's byClass vector
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  shown <- cm$byClass[c(1, 2, 5, 6, 7)]
  xs <- c(10, 30, 50, 70, 90)
  text(xs, 85, names(shown), cex=1.2, font=2)
  text(xs, 70, round(as.numeric(shown), 3), cex=1.2)

  # Overall accuracy and kappa
  overall <- cm$overall[1:2]
  text(c(30, 70), 35, names(overall), cex=1.5, font=2)
  text(c(30, 70), 20, round(as.numeric(overall), 3), cex=1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_CT_2(Confusion_Matrix_CT_2)

Comments: The Overall Accuracy is 0.81 for this Default Classification Trees.

Lift Chart

# Set seed
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift: mean response per decile relative to the overall mean
gain <- gains(EB_Validation_Pure$`Competitive?`, CT_Predictions_Prob[, 2])
lift_heights <- gain$mean.resp / mean(EB_Validation_Pure$`Competitive?`)
barplot(lift_heights,
        names.arg = gain$depth,
        xlab = "Percentile",
        ylab = "Mean Response",
        main = "Decile-wise lift chart",
        col = "#1c6155")

Comments:

b. Run a boosted tree with the same predictors

# Set seed
set.seed(1)

library(adabag)

# adabag::boosting() requires a factor outcome and chokes on non-syntactic
# column names, which is presumably why the model was left commented out.
# Fit on copies where the `Competitive?` column is renamed to a syntactic name.
EB_Training_Boost <- as.data.frame(EB_Training)
names(EB_Training_Boost)[names(EB_Training_Boost) == "Competitive?"] <- "Competitive"
EB_Validation_Boost <- as.data.frame(EB_Validation)
names(EB_Validation_Boost)[names(EB_Validation_Boost) == "Competitive?"] <- "Competitive"

# Boosted tree on the same predictors (factor outcome from EB_Training)
Model_Boost <- boosting(Competitive ~ ., data = EB_Training_Boost)

# Predicted classes / probabilities on the validation set
Prediction_Boost <- predict(Model_Boost, EB_Validation_Boost)

Comments: The Overall Accuracy is …

Lift Chart

Comments:

c. Run a bagged tree with the same predictors

# Set seed
set.seed(1)

# Load Package
library(adabag)

# Predictions
# Fit a bagged tree ensemble on the factored-outcome training set.
# NOTE(review): kept byte-identical -- bagging() consumes the RNG stream
# right after set.seed(1), so any reordering would change the fitted ensemble.
bag <- bagging(`Competitive?` ~ ., data = EB_Training) 

Prediction_Bagging <- predict(bag, EB_Validation, type="class")

# predict() on an adabag model returns a list; $class holds the predicted labels
Prediction_Bagging_Factor <- as.factor(Prediction_Bagging$class)

# Confusion Matrix: bagged-tree predictions vs actual validation labels
Confusion_Matrix_BT <- confusionMatrix(Prediction_Bagging_Factor, EB_Validation$`Competitive?`)

# Helper that renders the bagged-tree confusion matrix as a coloured 2x2 grid
# plus a statistics strip (per-class measures, accuracy, kappa)
draw_confusion_matrix_BT <- function(cm) {

  # Two-panel layout: matrix on top, statistics strip below
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Bagged Tree', cex.main=2)

  # Draw the 2x2 grid; columns = actual class, rows = predicted class
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(c(195, 295), 435, c('No', 'Yes'), cex=1.2)
  text(140, c(400, 335), c('No', 'Yes'), cex=1.2, srt=90)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)

  # Cell counts, taken in the column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts, cex=1.6, font=2, col='white')

  # Lower panel: selected per-class statistics from caret's byClass vector
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  shown <- cm$byClass[c(1, 2, 5, 6, 7)]
  xs <- c(10, 30, 50, 70, 90)
  text(xs, 85, names(shown), cex=1.2, font=2)
  text(xs, 70, round(as.numeric(shown), 3), cex=1.2)

  # Overall accuracy and kappa
  overall <- cm$overall[1:2]
  text(c(30, 70), 35, names(overall), cex=1.5, font=2)
  text(c(30, 70), 20, round(as.numeric(overall), 3), cex=1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_BT(Confusion_Matrix_BT)

Comments: The Overall Accuracy is 0.853 for this Bagged Tree.

Lift Chart

# Set seed
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift for the bagged tree: mean response per decile
# relative to the overall mean response
gain_2 <- gains(EB_Validation_Pure$`Competitive?`, Prediction_Bagging$prob[, 2])
lift_heights_2 <- gain_2$mean.resp / mean(EB_Validation_Pure$`Competitive?`)
barplot(lift_heights_2,
        names.arg = gain_2$depth,
        xlab = "Percentile",
        ylab = "Mean Response",
        main = "Decile-wise lift chart",
        col = "#1c6155")

Comments:

d. Run a random forest with argument mtry = 4

# Set seed
set.seed(1)

# Load Package
library(randomForest)

# Random forest with mtry = 4 candidate predictors per split.
# BUG FIX: the LHS must be the bare column name. With `EB_Training$...` on
# the LHS, the `.` expands to ALL columns of `data` -- including the outcome
# itself -- leaking the label into the predictors.
Random_Forest <- randomForest(`Competitive?` ~ ., data = EB_Training, mtry = 4)

# Predicted classes and class probabilities on the validation set
Prediction_RF <- predict(Random_Forest, EB_Validation, type="class")

Prediction_RF_Prob <- predict(Random_Forest, EB_Validation, type="prob")

# Confusion Matrix: RF predictions vs actual validation labels
Confusion_Matrix_RF <- confusionMatrix(data = Prediction_RF, reference = EB_Validation$`Competitive?`)

# Helper that renders the random-forest confusion matrix as a coloured 2x2
# grid plus a statistics strip (per-class measures, accuracy, kappa)
draw_confusion_matrix_RF <- function(cm) {

  # Two-panel layout: matrix on top, statistics strip below
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Random Forest', cex.main=2)

  # Draw the 2x2 grid; columns = actual class, rows = predicted class
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(c(195, 295), 435, c('No', 'Yes'), cex=1.2)
  text(140, c(400, 335), c('No', 'Yes'), cex=1.2, srt=90)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)

  # Cell counts, taken in the column-major order of cm$table
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts, cex=1.6, font=2, col='white')

  # Lower panel: selected per-class statistics from caret's byClass vector
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  shown <- cm$byClass[c(1, 2, 5, 6, 7)]
  xs <- c(10, 30, 50, 70, 90)
  text(xs, 85, names(shown), cex=1.2, font=2)
  text(xs, 70, round(as.numeric(shown), 3), cex=1.2)

  # Overall accuracy and kappa
  overall <- cm$overall[1:2]
  text(c(30, 70), 35, names(overall), cex=1.5, font=2)
  text(c(30, 70), 20, round(as.numeric(overall), 3), cex=1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_RF(Confusion_Matrix_RF)

Comments: The Overall Accuracy is 0.877 for this Random Forest.

Lift Chart

# Load Gains
library(gains)

# Decile-wise lift for the random forest: mean response per decile
# relative to the overall mean response
gain_3 <- gains(EB_Validation_Pure$`Competitive?`, Prediction_RF_Prob[, 2])
lift_heights_3 <- gain_3$mean.resp / mean(EB_Validation_Pure$`Competitive?`)
barplot(lift_heights_3,
        names.arg = gain_3$depth,
        xlab = "Percentile",
        ylab = "Mean Response",
        main = "Decile-wise lift chart",
        col = "#1c6155")

Comments:

Compare the bagged tree to the random forest in terms of validation accuracy and lift on first decile. How are the two methods conceptually different?

The bagged tree and the random forest reach similar validation accuracy (0.853 vs 0.877, with the random forest slightly ahead), and their first-decile lifts are also close. Conceptually, bagging grows each tree on a bootstrap sample using all predictors, whereas a random forest additionally restricts each split to a random subset of predictors (here mtry = 4), which decorrelates the trees and usually reduces variance further.

Ex 15.3

Customer Rating of Breakfast Cereals

Data Preprocessing

Remove all cereals with missing values

# Import Dataset Cereals.csv
Cereals <- fread("data/Cereals.csv")

# Row count before dropping incomplete rows.
# nrow() already returns a number, so the previous as.numeric() wrappers
# were redundant and have been removed.
Count_Before <- nrow(Cereals)

# Remove every cereal with at least one missing value
Cereals <- na.omit(Cereals)

Count_After <- nrow(Cereals)

# Number of rows dropped
Count_NA <- Count_Before - Count_After

print(paste("We removed", Count_NA, "rows containing NA value"))
## [1] "We removed 3 rows containing NA value"

a. Apply hierarchical clustering to the data using Euclidean distance to the normalized measurements

# Cluster on the numeric measurements only: `name`, `mfr` and `type` are
# character columns, and feeding them to dist() coerces those entries to NA,
# corrupting the distances. Keep the cereal name as the row label so the
# dendrograms are readable.
Cereals_Numeric <- as.data.frame(Cereals[, sapply(Cereals, is.numeric), with = FALSE])
rownames(Cereals_Numeric) <- Cereals$name

# Normalize (center and scale) every measurement before computing distances
Scaling <- preProcess(Cereals_Numeric, method = c("center", "scale"))
Cereals_Preprocess <- predict(Scaling, Cereals_Numeric)

# Computing Distance - Euclidean
Distance_Euclidian <- dist(Cereals_Preprocess, method = "euclidean")

# Computing Hierarchical Cluster - Single Linkage
hc1 <- hclust(Distance_Euclidian, method = "single")
plot(hc1, hang = -1, ann = TRUE)

# Computing Hierarchical Cluster - Complete Linkage
hc2 <- hclust(Distance_Euclidian, method = "complete")
plot(hc2, hang = -1, ann = TRUE)

# Computing Hierarchical Cluster - Centroid Linkage
hc3 <- hclust(Distance_Euclidian, method = "centroid")
plot(hc3, hang = -1, ann = TRUE)